package com.algo.jsyq;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import com.algo.segment.Segmenter;

/**
 * According to labeled articles, builds a new QingGanCiKu (sentiment dictionary).<br/>
 * The articles must comply with this specific format: <br/>
 * 		1. UTF-8 encoding <br/>
 * 		2. 1st line is url, 2nd line is title, 3rd line is entity,
 * 			4th line is NEGATIVE/POSITIVE label, 5th line is positive keywords,
 * 			6th line is negative keywords, 7th line and after is article content
 * 		3. keywords are separated with ' ' or '\t'
 * 
 * @author lujianfeng@miaozhen.com
 * 
 */

public class QingGanCiKu {

	//number of labeled documents processed, per class
	private int positiveDocumentNumber;
	private int negativeDocumentNumber;
	private int neutralDocumentNumber;
	
	//document frequency of the hand-labeled positive / negative keywords
	private Map<String, Integer> positiveWordsDf = new HashMap<String, Integer>();
	private Map<String, Integer> negativeWordsDf = new HashMap<String, Integer>();
	//per-word TF/DF statistics collected from titles and article bodies
	private HashMap<String, WordTendency> wordsTendency = new HashMap<String, WordTendency>();
	
	//used to segment Chinese text into words
	private Segmenter segmenter;
	
	//class labels exactly as written in the labeled documents
	public final static String POSITIVE = "正面";
	public final static String NEGATIVE = "负面";
	public final static String NEUTRAL = "中性";
	
	//labeled in document, indicating no correlating keywords
	public final static String NO = "无";
	
	//common labeling mistake, corrected to NEGATIVE
	public final static String FANMIAN = "反面";
	
	//matches tokens made up entirely of non-Chinese characters; such tokens are
	//skipped on output. Precompiled once — String.matches would recompile per word.
	private final static Pattern NON_CHINESE = Pattern.compile("[^\u4E00-\u9FA5]+");
	
	//average DF values, used as additive terms for Bayes smoothing
	private double positiveAvgKeyDf, negativeAvgKeyDf, positiveAvgTitleDf, negativeAvgTitleDf,
		neutralAvgTitleDf, positiveAvgDf, negativeAvgDf, neutralAvgDf;

	/**
	 * Averages every DF statistic over the whole vocabulary; the averages are
	 * later used as smoothing priors by {@link #countTendencyScore}. Finally
	 * overwrites the per-class document counts with a fixed value so scoring
	 * ignores class imbalance in the corpus.
	 */
	private void countAverageDf(){
		int vocabularySize = wordsTendency.size();
		if(vocabularySize == 0)	return;	//no documents processed; avoid division by zero
		
		for(Integer n : positiveWordsDf.values())
			positiveAvgKeyDf += n;
		positiveAvgKeyDf /= vocabularySize;
		
		for(Integer n : negativeWordsDf.values())
			negativeAvgKeyDf += n;
		negativeAvgKeyDf /= vocabularySize;
		
		for(WordTendency wt : wordsTendency.values()){
			positiveAvgTitleDf += wt.getPositiveTitleDf();
			negativeAvgTitleDf += wt.getNegativeTitleDf();
			neutralAvgTitleDf += wt.getNeutralTitleDf();
			positiveAvgDf += wt.getPositiveDf();
			negativeAvgDf += wt.getNegativeDf();
			neutralAvgDf += wt.getNeutralDf();
		}
		positiveAvgTitleDf /= vocabularySize;
		negativeAvgTitleDf /= vocabularySize;
		neutralAvgTitleDf /= vocabularySize;
		positiveAvgDf /= vocabularySize;
		negativeAvgDf /= vocabularySize;
		neutralAvgDf /= vocabularySize;
		
		System.out.println(positiveAvgKeyDf+","+ negativeAvgKeyDf+","+ positiveAvgTitleDf+","+ negativeAvgTitleDf+","
				+ neutralAvgTitleDf+","+ positiveAvgDf + "," + negativeAvgDf + "," + neutralAvgDf);
		
		//ignore document number difference: pretend the classes are balanced
		positiveDocumentNumber = 400;
		negativeDocumentNumber = 400;
		neutralDocumentNumber = 400;
	}

	/**
	 * Computes a sentiment score for {@code word} from three DF sources — the
	 * hand-labeled keywords, the titles, and the article bodies — each
	 * Bayes-smoothed with the corresponding average DF, then combines the
	 * partial scores as a weighted mean over the sources that had any signal.
	 *
	 * @param word     the word being scored
	 * @param tendency the accumulated TF/DF statistics of that word
	 * @return CSV fragment "s1,s2,s3,s": the three partial scores and the
	 *         weighted combination, each formatted to six decimals
	 */
	private String countTendencyScore(String word, WordTendency tendency){
		int w1 = 12, w2 = 5, w3 = 3, ws = 0;	//source weights; ws accumulates the active ones
		
		//score 1: hand-labeled keyword DF (no neutral keywords exist)
		int pdf = positiveWordsDf.getOrDefault(word, 0);
		int ndf = negativeWordsDf.getOrDefault(word, 0);
		double positive = positiveDocumentNumber*(pdf + positiveAvgKeyDf);
		double negative = negativeDocumentNumber*(ndf + negativeAvgKeyDf);
		double s1 = 0;
		if((pdf + ndf) != 0){	//only when the word was ever a labeled keyword
			s1 = (positive - negative)/(positive + negative);
			ws += w1;
		}
		
		//score 2: title DF; neutral DF dilutes the denominator
		pdf = tendency.getPositiveTitleDf();
		ndf = tendency.getNegativeTitleDf();
		int udf = tendency.getNeutralTitleDf();
		positive = positiveDocumentNumber*(pdf + positiveAvgTitleDf);
		negative = negativeDocumentNumber*(ndf + negativeAvgTitleDf);
		double neutral = neutralDocumentNumber*(udf + neutralAvgTitleDf);
		double s2 = 0;
		if((pdf + ndf) != 0){
			s2 = (positive - negative)/(positive + negative + neutral);
			ws += w2;
		}
		
		//score 3: article-body DF, same shape as score 2
		pdf = tendency.getPositiveDf();
		ndf = tendency.getNegativeDf();
		udf = tendency.getNeutralDf();
		positive = positiveDocumentNumber*(pdf + positiveAvgDf);
		negative = negativeDocumentNumber*(ndf + negativeAvgDf);
		neutral = neutralDocumentNumber*(udf + neutralAvgDf);
		double s3 = 0;
		if((pdf + ndf) != 0){
			s3 = (positive - negative)/(positive + negative + neutral);
			ws += w3;
		}
		
		double s = 0;
		if(ws != 0)
			s = (w1*s1 + w2*s2 + w3*s3)/ws;
		return String.format("%1.6f,%1.6f,%1.6f,%1.6f", s1, s2, s3, s);
	}
	
	/**
	 * TF/DF counters for one word, split by class (positive/negative/neutral)
	 * and by location (title vs. article body). Static nested class: it needs
	 * no reference to the enclosing instance.
	 */
	private static class WordTendency{
		private int positiveTf, positiveDf, positiveTitleTf, positiveTitleDf;
		private int negativeTf, negativeDf, negativeTitleTf, negativeTitleDf;
		private int neutralTf, neutralDf, neutralTitleTf, neutralTitleDf;
		
		/** Increments the body term frequency of the given class label. */
		public void incrementTf(String positiveOrNegative){
			if(positiveOrNegative.equals(POSITIVE))
				positiveTf++;
			else if(positiveOrNegative.equals(NEGATIVE))
				negativeTf++;
			else if(positiveOrNegative.equals(NEUTRAL))
				neutralTf++;
			else throw new RuntimeException("ERROR: label error");
		}
		public int getPositiveDf() {
			return positiveDf;
		}
		public int getPositiveTitleDf() {
			return positiveTitleDf;
		}
		public int getNegativeDf() {
			return negativeDf;
		}
		public int getNegativeTitleDf() {
			return negativeTitleDf;
		}
		public int getNeutralDf() {
			return neutralDf;
		}
		public int getNeutralTitleDf() {
			return neutralTitleDf;
		}
		/** Increments the body document frequency of the given class label. */
		public void incrementDf(String positiveOrNegative){
			if(positiveOrNegative.equals(POSITIVE))	positiveDf++;
			else if(positiveOrNegative.equals(NEGATIVE))	negativeDf++;
			else if(positiveOrNegative.equals(NEUTRAL))	neutralDf++;
			else throw new RuntimeException("ERROR: label error");
		}
		/** Increments the title term frequency of the given class label. */
		public void incrementTitleTf(String positiveOrNegative){
			if(positiveOrNegative.equals(POSITIVE))	positiveTitleTf++;
			else if(positiveOrNegative.equals(NEGATIVE))	negativeTitleTf++;
			else if(positiveOrNegative.equals(NEUTRAL))	neutralTitleTf++;
			else throw new RuntimeException("ERROR: label error");
		}
		/** Increments the title document frequency of the given class label. */
		public void incrementTitleDf(String positiveOrNegative){
			if(positiveOrNegative.equals(POSITIVE))	positiveTitleDf++;
			else if(positiveOrNegative.equals(NEGATIVE))	negativeTitleDf++;
			else if(positiveOrNegative.equals(NEUTRAL))	neutralTitleDf++;
			else throw new RuntimeException("ERROR: label error");
		}
		/** CSV fragment of all twelve counters, in output-column order. */
		@Override
		public String toString(){
			return positiveTf +","+ positiveDf+","+ negativeTf+","+  negativeDf +","+  positiveTitleTf+","+  
					positiveTitleDf +","+negativeTitleTf+","+ negativeTitleDf +","+ 
					neutralTf+","+ neutralDf+","+ neutralTitleTf+","+ neutralTitleDf;
		}
	}
	
	/**
	 * @param dict path of the user dictionary used by the Chinese segmenter
	 */
	public QingGanCiKu(String dict){
		segmenter = new Segmenter(dict);
	}

	/** Increments the per-class document counter for the given label. */
	private void incrementDocumentNumber(String positiveOrNegative){
		if(positiveOrNegative.equals(POSITIVE))	positiveDocumentNumber++;
		else if(positiveOrNegative.equals(NEGATIVE))	negativeDocumentNumber++;
		else if(positiveOrNegative.equals(NEUTRAL))	neutralDocumentNumber++;
		else throw new RuntimeException("ERROR: label error");
	}

	//true when the label is one of the three known classes
	private static boolean isValidLabel(String label){
		return label.equals(NEGATIVE) || label.equals(NEUTRAL) || label.equals(POSITIVE);
	}

	//reads the next line, failing with a diagnostic instead of a later NPE on a truncated file
	private static String requireLine(BufferedReader br, String path, String what) throws IOException {
		String line = br.readLine();
		if(line == null)
			throw new IOException("truncated labeled file, missing " + what + ": " + path);
		return line;
	}

	//increments the DF of each whitespace-separated keyword, ignoring the NO ("无") marker
	private static void addKeywords(String line, Map<String, Integer> df){
		for(String keyWord : line.split("[ \\t]+"))
			if(!keyWord.equals(NO))
				df.put(keyWord, 1 + df.getOrDefault(keyWord, 0));
	}

	/**
	 * Reads one labeled article (UTF-8, per the class contract) and folds its
	 * title words, keyword lines and body words into the statistics.
	 * Expected layout in this corpus: line 1 url, line 2 title, line 3 label
	 * (the entity line is usually absent — when it is present, the label is
	 * read from the following line instead), then positive keywords, negative
	 * keywords, and the article body.
	 *
	 * @param path path of the labeled article file
	 */
	private void throughLabelledFile(String path){
		//explicit UTF-8 — FileReader would use the platform default charset;
		//try-with-resources closes the reader even on error
		try (BufferedReader br = Files.newBufferedReader(Paths.get(path), StandardCharsets.UTF_8)){
			Set<String> words = new HashSet<String>();
			br.readLine();	//ignore the url
			List<String> titleWords = segmenter.segment(requireLine(br, path, "title"));
			//dealing with label error: most files don't have the entity line
			String positiveOrNegative = requireLine(br, path, "label").trim();
			if(positiveOrNegative.equals(FANMIAN)) positiveOrNegative = NEGATIVE;
			if(!isValidLabel(positiveOrNegative)){
				//this file does contain the entity line; the label is one line later
				positiveOrNegative = requireLine(br, path, "label").trim();
				System.out.println(path);
			}
			if(!isValidLabel(positiveOrNegative)){
					System.out.println("ERROR");
					return;
			}
			//title words: TF per occurrence, DF once per document
			for(String word : titleWords){
				WordTendency wt = wordsTendency.computeIfAbsent(word, k -> new WordTendency());
				wt.incrementTitleTf(positiveOrNegative);
				words.add(word);
			}
			for(String word : words)
				wordsTendency.get(word).incrementTitleDf(positiveOrNegative);
			words.clear();
			//hand-labeled keyword lines
			addKeywords(requireLine(br, path, "positive keywords"), positiveWordsDf);
			addKeywords(requireLine(br, path, "negative keywords"), negativeWordsDf);
			//article body: TF per occurrence, DF once per document
			String line;
			while((line = br.readLine()) != null){
				for(String word : segmenter.segment(line)){
					WordTendency wt = wordsTendency.computeIfAbsent(word, k -> new WordTendency());
					wt.incrementTf(positiveOrNegative);
					words.add(word);
				}
			}
			for(String word : words)
				wordsTendency.get(word).incrementDf(positiveOrNegative);
			incrementDocumentNumber(positiveOrNegative);
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Writes one CSV line per word containing at least one Chinese character:
	 * the raw TF/DF counters, the keyword DFs, the smoothed tendency scores,
	 * and — when available — the score from the legacy dictionary.
	 *
	 * @param out output file path (written as UTF-8)
	 */
	private void saveInfoToFile(String out){
		try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(out), StandardCharsets.UTF_8)){
			for(Map.Entry<String, WordTendency> entry : wordsTendency.entrySet()){
				String word = entry.getKey();
				//skip tokens with no Chinese characters at all (latin, digits, punctuation);
				//the old single-char pattern only filtered one-character tokens because
				//String.matches anchors the whole string
				if(NON_CHINESE.matcher(word).matches())	continue;
				bw.write(word + ","+entry.getValue().toString());
				bw.write("," + positiveWordsDf.getOrDefault(word, 0) + "," + negativeWordsDf.getOrDefault(word, 0));
				bw.write("," + countTendencyScore(word, entry.getValue()));
				Double score = oldCiKu.get(word);
				if(score != null)	bw.write("," + score);
				bw.write("\n");
			}
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}
	
	//word -> score pairs loaded from the legacy sentiment dictionaries
	Map<String, Double> oldCiKu = new HashMap<String, Double>();

	/**
	 * Loads legacy "word score" dictionaries (UTF-8, one pair per line) so the
	 * old score can be appended to the output for comparison.
	 *
	 * @param path paths of the legacy dictionary files
	 */
	private void readOldCiKu(String[] path){
		for(String file : path){
			try (BufferedReader br = Files.newBufferedReader(Paths.get(file), StandardCharsets.UTF_8)){
				String line;
				while((line = br.readLine()) != null){
					//"+" tolerates runs of separators, consistent with the keyword parsing
					String[] items = line.split("[ \\t]+");
					if(items.length == 2)
						oldCiKu.put(items[0], Double.parseDouble(items[1]));
					else System.out.println("CiKu:" + line);
				}
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
		}
	}
	
	/**
	 * According to labeled articles, builds a new QingGanCiKu (sentiment dictionary).<br/>
	 * The articles must comply with this specific format: <br/>
	 * 		1. UTF-8 encoding <br/>
	 * 		2. 1st line is url, 2nd line is title, 3rd line is entity,
	 * 			4th line is NEGATIVE/POSITIVE label, 5th line is positive keywords,
	 * 			6th line is negative keywords, 7th line and after is article content
	 * 		3. keywords are separated with ' ' or '\t'
	 *
	 * @param paths   [0] segmenter user dictionary, [1] labeled-article directory, [2] output csv path
	 * @param oldCiKu legacy dictionary files whose scores are merged into the output
	 */
	public static void GenerateQingGanCiKu(String[] paths, String[] oldCiKu){
		QingGanCiKu qingGan = new QingGanCiKu(paths[0]);
		for(String path : FindNewWords.getAllFiles(new File(paths[1]))){
			qingGan.throughLabelledFile(path);
		}
		qingGan.countAverageDf();
		
		qingGan.readOldCiKu(oldCiKu);
		qingGan.saveInfoToFile(paths[2]);
	}
	
	/** Example invocation with hard-coded local paths. */
	public static void main(String[] args) {
		String[] paths = {
				"D://过去的工作//江苏银行爬虫//sentiment//src//main//resources//user_dict00.new",
				"C://Users//supertool//Documents//Tencent Files//419284651//FileRecv//dataCollection2",
				"D://过去的工作//江苏银行爬虫//sentiment//src//main//resources//qgout.csv"
		};
		
		String[] ciku = {"D://过去的工作//江苏银行爬虫//sentiment//src//main//resources//dict//negative_word.dic",
				"D://过去的工作//江苏银行爬虫//sentiment//src//main//resources//dict//positive_word.dic"};
		
		GenerateQingGanCiKu(paths, ciku);
	}

}
